library(tidyverse)
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.2.1 ──
## ✔ ggplot2 3.1.0     ✔ purrr   0.2.5
## ✔ tibble  1.4.2     ✔ dplyr   0.7.5
## ✔ tidyr   0.8.1     ✔ stringr 1.3.1
## ✔ readr   1.1.1     ✔ forcats 0.3.0
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
library(pwr)
library(knitr)
library(kableExtra)
library(plotly)
## 
## Attaching package: 'plotly'
## The following object is masked from 'package:ggplot2':
## 
##     last_plot
## The following object is masked from 'package:stats':
## 
##     filter
## The following object is masked from 'package:graphics':
## 
##     layout
library(extrafont)
## Registering fonts with R
library(ggrepel)

First we’ll do an F-test for equal variances to see if we can overfide the default Welch approximation.

# F-test for equal variances
# 
chickwts_f <- chickwts %>% 
  filter(feed == "horsebean" | feed == "linseed") %>% 
  var.test(weight ~ feed, data = .)

chickwts_f
## 
##  F test to compare two variances
## 
## data:  weight by feed
## F = 0.54679, num df = 9, denom df = 11, p-value = 0.3739
## alternative hypothesis: true ratio of variances is not equal to 1
## 95 percent confidence interval:
##  0.1523986 2.1390857
## sample estimates:
## ratio of variances 
##          0.5467906
# Retain the null hypothesis variances are equal
# override the default in t.test (var.equal = FALSE)
# t.test()

chickwts_t <- chickwts %>% 
  filter(feed == "horsebean" | feed == "linseed") %>% 
  t.test(weight ~ feed, data = ., var.equal = TRUE)

chickwts_t
## 
##  Two Sample t-test
## 
## data:  weight by feed
## t = -2.934, df = 20, p-value = 0.008205
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  -100.17618  -16.92382
## sample estimates:
## mean in group horsebean   mean in group linseed 
##                  160.20                  218.75
## OR:

#chick_simple <- chickwts %>% 
 # var.test(weight ~ feed, data = .)  # something wrong with the lines

#t_test <- chick_simple %>% 
 # t.test(weight ~ feed, data = ., var.equal = TRUE)

#t_test

chick_simple <- chickwts %>% 
  filter(feed == "horsebean" | feed == "linseed")

f_test <- chick_simple %>% 
  var.test(weight ~ feed, data = .)

t_test <- chick_simple %>% 
  t.test(weight ~ feed, data = .)

t_test
## 
##  Welch Two Sample t-test
## 
## data:  weight by feed
## t = -3.0172, df = 19.769, p-value = 0.006869
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  -99.0597 -18.0403
## sample estimates:
## mean in group horsebean   mean in group linseed 
##                  160.20                  218.75

p value means, if there is no difference between the two population, that the probability of the two mean are different is p.

Mean weights of chickens fed horsebean (mean sd) and linseed (mean sd) differ significantly (t(20) = r round(chickwts_t$statistic,2), p = 0.008, \(\alpha\)=0.05).

Part2. Power analysis

We need to collect samples to test a hypothesis (planning on doing a 2-sample t-test) to see if there is a difference in means for phosphate concentrations in lagoons downstream from golf courses of NOT downstream from golf courses.

Find n for “small” effect size (~0.2), and a “large” effect size (~0.8) as endpoints for n estimations

# A priori power analysis

# How many samples needed if there is a SMALL effect size, and alpha = 0.05. power = 0.80

power_small <- pwr.t.test(n = NULL, power = 0.8, sig.level = 0.05, d = 0.2)
power_small
## 
##      Two-sample t test power calculation 
## 
##               n = 393.4057
##               d = 0.2
##       sig.level = 0.05
##           power = 0.8
##     alternative = two.sided
## 
## NOTE: n is number in *each* group
power_large <- pwr.t.test(n = NULL, power = 0.8, sig.level = 0.05, d = 0.8)
power_large
## 
##      Two-sample t test power calculation 
## 
##               n = 25.52458
##               d = 0.8
##       sig.level = 0.05
##           power = 0.8
##     alternative = two.sided
## 
## NOTE: n is number in *each* group

Part3. Piping into ggplot

mortality <- read_csv("drug_mortality.csv")
## Parsed with column specification:
## cols(
##   year = col_integer(),
##   sex = col_character(),
##   ages = col_character(),
##   race_and_hispanic_origin = col_character(),
##   state = col_character(),
##   deaths = col_integer(),
##   population = col_integer(),
##   death_rate = col_double()
## )
income <- read_csv("state_income.csv")
## Parsed with column specification:
## cols(
##   state = col_character(),
##   med_income = col_integer()
## )
mortality_1 <- mortality %>% 
  filter(ages == "All Ages",
         sex == "Both Sexes",
         race_and_hispanic_origin == "All Races-All Origins",
         state != "United States") %>% 
  select(year, state:death_rate)

m15 <- mortality_1 %>% 
  filter(year == 2015) %>% 
  mutate(highlight = ifelse(state == "Kentucky", "Yes", "No")) %>% 
  arrange(-death_rate) %>% 
  head(10) %>% 
  ggplot(aes(x = reorder(state, -death_rate), y = death_rate))+
  geom_col(aes(fill = highlight))+
  scale_fill_manual(values = c("gray60", "red"))

m15

ggplotly(m15)
# graph_1

4. Interactive bubble plot

We’ll use full_join (dyplyr) - this means that you retain ALL observations, even if the level only exists in one of the data frames

View(mortality_1)
## Warning in system2("/usr/bin/otool", c("-L", shQuote(DSO)), stdout = TRUE):
## running command ''/usr/bin/otool' -L '/Library/Frameworks/R.framework/
## Resources/modules/R_de.so'' had status 1
mi_2015 <- full_join(mortality_1, income) %>% 
  filter(year == 2015)
## Joining, by = "state"
graph_3 <-  ggplot(filter(mi_2015, med_income > 60000), 
         aes(x = med_income,
             y = death_rate,
             label = state))+
  geom_point()+
  geom_text_repel()

graph_3 <-  ggplot(filter(mi_2015), 
         aes(x = med_income,
             y = death_rate,
             label = state))+
  geom_point(aes(size = population, color = state), alpha = 0.4)+
  geom_text_repel(size = 3)+
  theme(legend.position = "NA")

graph_3

ggplotly(graph_3)
## Warning in geom2trace.default(dots[[1L]][[1L]], dots[[2L]][[1L]], dots[[3L]][[1L]]): geom_GeomTextRepel() has yet to be implemented in plotly.
##   If you'd like to see this geom implemented,
##   Please open an issue with your example code at
##   https://github.com/ropensci/plotly/issues

Part5. table using kable/kableExtra

ca_table <- mortality_1 %>% 
  filter(year >= 2010,
         state == "California")

ca_table
## # A tibble: 6 x 5
##    year state      deaths population death_rate
##   <int> <chr>       <int>      <int>      <dbl>
## 1  2010 California   4057   37253956       10.9
## 2  2011 California   4180   37691912       11.1
## 3  2012 California   4040   38041430       10.6
## 4  2013 California   4452   38332521       11.6
## 5  2014 California   4521   38802500       11.7
## 6  2015 California   4659   39144818       11.9
ca_final <- kable(ca_table) %>% 
  kable_styling(bootstrap_options = c("striped", "hover"), full_width = FALSE)

ca_final
year state deaths population death_rate
2010 California 4057 37253956 10.9
2011 California 4180 37691912 11.1
2012 California 4040 38041430 10.6
2013 California 4452 38332521 11.6
2014 California 4521 38802500 11.7
2015 California 4659 39144818 11.9